OutFLANK: Fst outliers with trimming

Documented in MakeDiploidFSTMat WC_FST_Diploids_2Alleles

####Diploid Fst functions for OutFLANK

####### Fst functions for OutFLANK
#' 
#' Calculates FST both with and without correction for local sample sizes, for diploid biallelic data. Based on Weir and Cockerham (1984)
#' 
#' @title FST calculation for biallelic diploid data
#'
#' @param Sample_Mat This is an array with a row for each population. There should be three columns, with the numbers of individuals from that population which are homozygotes for one allele, heterozygotes, and homozygotes for the other allele. 
#' 
#' @return Returns a list of values related to FST:
#'  \itemize{
#'  \item   He:  The expected heterozygosity of the locus
#'  \item   FST:  Fst (with sample size correction)
#'  \item   T1: The numerator of the Fst calculation
#'  \item   T2: The denominator of the Fst calculation
#'  \item   FSTNoCorr:  Fst (without sample size correction)
#'  \item   T1NoCorr: The numerator of the Fst calculation without sample size correction
#'  \item   T2NoCorr: The denominator of the Fst calculation without sample size correction
#'  \item   meanAlleleFreq: The mean allele frequency over all populations for this locus
#'  }
#'@export
#'  
WC_FST_Diploids_2Alleles<-function(Sample_Mat){
  ##Calculate both Fst and Fst NoCorr at the same time, from WC84
  #Sample Mat has three columns (homo_p,m heterozygotes, and homo_q) and a row for each population
  
  sample_sizes = rowSums(Sample_Mat)
  n_ave = mean(sample_sizes)
  n_pops = nrow(Sample_Mat) #r
  r = n_pops
  n_c = (n_pops*n_ave - sum(sample_sizes^2)/(n_pops*n_ave))/(n_pops-1)
  p_freqs = (Sample_Mat[,1] + Sample_Mat[,2]/2) /sample_sizes
  p_ave = sum(sample_sizes*p_freqs)/(n_ave*n_pops)
  
  s2 = sum(sample_sizes*(p_freqs - p_ave)^2)/((n_pops-1)*n_ave)
  if(s2==0){return(0); break}  
  
  h_freqs = Sample_Mat[,2]/sample_sizes
  h_ave = sum(sample_sizes*h_freqs)/(n_ave*n_pops)
  
  a <- n_ave/n_c*(s2 - 1/(n_ave-1)*(p_ave*(1-p_ave)-((r-1)/r)*s2-(1/4)*h_ave))
  
  b <- n_ave/(n_ave-1)*(p_ave*(1-p_ave) - (r-1)/r*s2 - (2*n_ave - 1)/(4*n_ave)*h_ave)
  
  c <- 1/2*h_ave
  
  aNoCorr <- n_ave/n_c*(s2)
  
  bNoCorr <- (p_ave*(1-p_ave) - (r-1)/r*s2 - (2*n_ave)/(4*n_ave)*h_ave)
  
  cNoCorr <- 1/2*h_ave
  
  He <- 1-sum(p_ave^2, (1-p_ave)^2)
  
  FST <- a/(a+b+c) 
  FSTNoCorr = aNoCorr/(aNoCorr+bNoCorr+cNoCorr)
  return(list(He=He,FST=FST, T1=a, T2=(a+b+c),FSTNoCorr=FSTNoCorr, T1NoCorr=aNoCorr, T2NoCorr=(aNoCorr+bNoCorr+cNoCorr),meanAlleleFreq = p_ave))
}

####### Create input file for OutFLANK
#' 
#' Creates OutFLANK input file from individual genotype info. 
#' 
#' @title Create OutFLANK input file
#'
#' @param SNPmat This is an array of genotypes with a row for each individual. There should be a column for each SNP, with the number of copies of the focal allele (0, 1, or 2) for that individual. If that individual is missing data for that SNP, there should be a 9, instead. 
#' @param locusNames A list of names for each SNP locus. There should be the same number of locus names as there are columns in SNPmat.
#' @param popNames A list of population names to give location for each individual. Typically multiple individuals will have the same popName. The list popNames should have the same length as the number of rows in SNPmat.
#' 
#' @return Returns a data frame in the form needed for the main OutFLANK function.
#' 
#'@export
#'  
MakeDiploidFSTMat = function(SNPmat,locusNames,popNames){
  # SNPmat is a matrix with individuals in rows and snps in columns
  # 0, 1, or 2 represent the number of copies of the focal allele, and 9 is for missing data
  # locusNames is a character vector of names of each SNP
  # popNames is a character vector with the population identifier for each individual 
  
  locusname <- unlist(locusNames)
  popname <- unlist(popNames)
  
  ### Check that SNPmat has appropriate values (0, 1, 2, or 9, only)
  snplevs <- levels(as.factor(unlist(SNPmat)))
  ls <- paste(snplevs, collapse="")
  if(ls!="012" & ls!="0129"){print("Error: Your snp matrix does not have 0,1, and 2"); break}
  
  ### Checking that locusNames and popNames have the same lengths as the columns and rows of SNPmat
  if(dim(SNPmat)[1]!=length(popname) ){
    print("Error: your population names do not match your SNP matrix")
    break}
  
  if(dim(SNPmat)[2]!=length(locusname)){
    print("Error:  your locus names do not match your SNP matrix")
    break}
  
  writeLines("Calculating FSTs, may take a few minutes...")
  
  nloci <- length(locusname)
  FSTmat <- matrix(NA, nrow=nloci, ncol=8)
  for (i in 1:nloci){
    FSTmat[i,]=unlist(getFSTs_diploids(popname,SNPmat[,i]))
    if (i%%10000==0){print(paste(i, "done of", nloci))}
  }
  outTemp=as.data.frame(FSTmat)
  outTemp = cbind(locusname,outTemp)
  
  colnames(outTemp)= c("LocusName","He", "FST", "T1", "T2", "FSTNoCorr", "T1NoCorr", "T2NoCorr", "meanAlleleFreq")
  return (outTemp)
  
}


### Calculates FST etc. from a single locus from a column of individual data
getFSTs_diploids = function(popNameList, SNPDataColumn){  
  #eliminating the missing data for this locus
  popnames=unlist(as.character(popNameList))
  popNameTemp=popnames[which(SNPDataColumn!=9)]
  snpDataTemp=SNPDataColumn[SNPDataColumn!=9]
  
  HetCounts <- tapply(snpDataTemp, list(popNameTemp,snpDataTemp), length)
  HetCounts[is.na(HetCounts)] = 0
  
  #Case: all individuals are genetically identical at this locus
  if(dim(HetCounts)[2]==1){
    return (list(He=NA,FST=NA, T1=NA, T2=NA,FSTNoCorr=NA, T1NoCorr=NA, T2NoCorr=NA,meanAlleleFreq = NA))
  }
  
  if(dim(HetCounts)[2]==2){
    if(paste(colnames(HetCounts),collapse="")=="01"){HetCounts=cbind(HetCounts,"2"=0)}
    if(paste(colnames(HetCounts),collapse="")=="12"){HetCounts=cbind("0"=0,HetCounts)} 
    if(paste(colnames(HetCounts),collapse="")=="02"){HetCounts=cbind(HetCounts[,1],"1"=0, HetCounts[,2])}
  }
  
  out = WC_FST_Diploids_2Alleles(HetCounts)	
  return(out)
}